"""Topic modeling of pro-vaccine tweets with BERTopic.

Notebook-style script: loads the pro-vaxxer tweet dataset, fits a
multilingual BERTopic model, reduces the topic count to ~9 for
interpretability, then computes and visualizes topics over time.

NOTE(review): this file was pasted from a Jupyter session — the logged
BERTopic output that was interleaved with the code is preserved below as
comments so the file is actually runnable as a script.
"""
import os

import pandas as pd
from bertopic import BERTopic
from umap import UMAP

# Silence HuggingFace tokenizers fork-parallelism warnings/deadlocks.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Pro-vaxxer tweets; code below requires 'text' and 'created_at' columns
# (assumed from usage — confirm against the CSV schema).
provaxxers = pd.read_csv('./datasets/kdmile/provaxxers.csv', low_memory=False)
# Anti-vaxxer cohort kept for reference; swap in to run the same pipeline.
# antivaxxers = pd.read_csv('./datasets/kdmile/antivaxxers.csv', low_memory=False)
docs = provaxxers

# 10-D cosine UMAP projection that feeds HDBSCAN clustering inside BERTopic.
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine')

topic_model = BERTopic(language='multilingual',
                       top_n_words=10,
                       n_gram_range=(1, 2),
                       min_topic_size=55,
                       nr_topics='auto',
                       umap_model=umap_model,
                       low_memory=True,
                       calculate_probabilities=False,
                       verbose=True)

topics, probs = topic_model.fit_transform(docs.text)
# Recorded output from the original run:
# 2021-08-07 12:17:36,067 - BERTopic - Transformed documents to Embeddings
# 2021-08-07 12:26:54,689 - BERTopic - Reduced dimensionality with UMAP
# 2021-08-07 12:27:16,542 - BERTopic - Clustered UMAP embeddings with HDBSCAN
# 2021-08-07 12:27:46,024 - BERTopic - Reduced number of topics from 354 to 196

# NOTE(review): visualize_topics() returns a Plotly figure; in a plain
# script (unlike a notebook) the return value is discarded — assign it and
# call .show() / .write_html() if a rendered figure is needed.
topic_model.visualize_topics()

# Collapse to ~9 topics for interpretability (plus the -1 outlier topic,
# hence the "196 to 10" in the log below).
newTopics, newProbs = topic_model.reduce_topics(docs.text, topics, probs, nr_topics=9)
# Recorded output from the original run:
# 2021-08-07 12:28:31,366 - BERTopic - Reduced number of topics from 196 to 10
topic_model.visualize_topics()

# Dynamic topic modeling: bin tweets into 20 time slices by creation date.
timestamps = docs.created_at.to_list()
tweets = docs.text.to_list()
topics_over_time = topic_model.topics_over_time(docs=tweets,
                                                topics=newTopics,
                                                timestamps=timestamps,
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)
# Recorded output from the original run:
# 20it [01:28, 4.42s/it]
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=30)